{
  "nbformat": 4,
  "nbformat_minor": 0,
  "metadata": {
    "colab": {
      "name": "NaiveBayes.ipynb",
      "version": "0.3.2",
      "provenance": [],
      "collapsed_sections": []
    },
    "language_info": {
      "codemirror_mode": {
        "name": "ipython",
        "version": 3
      },
      "file_extension": ".py",
      "mimetype": "text/x-python",
      "name": "python",
      "nbconvert_exporter": "python",
      "pygments_lexer": "ipython3",
      "version": "3.6.2"
    },
    "kernelspec": {
      "display_name": "Python 3",
      "language": "python",
      "name": "python3"
    }
  },
  "cells": [
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "WDqA-VKvfWix",
        "colab_type": "text"
      },
      "source": [
        "# 第4章 朴素贝叶斯"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "rMWMEdyUfWix",
        "colab_type": "text"
      },
      "source": [
        "基于贝叶斯定理与特征条件独立假设的分类方法。\n",
        "\n",
        "模型：\n",
        "\n",
        "- 高斯模型\n",
        "- 多项式模型\n",
        "- 伯努利模型"
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "mahnF7NFfWiy",
        "colab_type": "code",
        "colab": {}
      },
      "source": [
        "import numpy as np\n",
        "import pandas as pd\n",
        "import matplotlib.pyplot as plt\n",
        "%matplotlib inline\n",
        "\n",
        "from sklearn.datasets import load_iris\n",
        "from sklearn.model_selection import train_test_split\n",
        "\n",
        "from collections import Counter\n",
        "import math"
      ],
      "execution_count": 0,
      "outputs": []
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "6tRQt9QFf27Y",
        "colab_type": "code",
        "colab": {}
      },
      "source": [
        "# 例 4.1 \n",
        "lambda_ = 0.2\n",
        "x = [2, 'S']\n",
        "\n",
        "X1 = [1,2,3]\n",
        "X2 = ['S', 'M', 'L']\n",
        "Y = [1, -1]"
      ],
      "execution_count": 0,
      "outputs": []
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "CCzHK_v2hafm",
        "colab_type": "text"
      },
      "source": [
        "$P_\\lambda(Y=1)=(9+lambda\\_)/(15 + 2*lambda\\_) = (9+0.2)/(15+2*0.2)=0.5974025974025974$\n",
        "$P_\\lambda(Y=-1)=(6+lambda\\_)/(15 + 2*lambda\\_) = (6+0.2)/(15+2*0.2)=0.40259740259740264$  \n",
        "$P(X^{(1)}=1|Y=1) = (2+0.2)/(9+3*0.2)=0.22916666666666669 $  \n",
        "$P(X^{(1)}=2|Y=1) = (3+0.2)/(9+3*0.2)=0.33333333333333337 $  \n",
        "$P(X^{(1)}=3|Y=1) = (4+0.2)/(9+3*0.2)=0.43750000000000006 $  \n",
        "$P(X^{(2)}=S|Y=1) = (1+0.2)/(9+3*0.2)=0.125 $   \n",
        "$P(X^{(2)}=M|Y=1) = (4+0.2)/(9+3*0.2)=0.43750000000000006 $     \n",
        "$P(X^{(2)}=L|Y=1) = (4+0.2)/(9+3*0.2)=0.43750000000000006 $  \n",
        "$P(X^{(1)}=1|Y=-1) = (3+0.2)/(6+3*0.2)=0.4848484848484849 $  \n",
        "$P(X^{(1)}=2|Y=-1) = (2+0.2)/(6+3*0.2)=0.33333333333333337 $   \n",
        "$P(X^{(1)}=3|Y=-1) = (1+0.2)/(6+3*0.2)=0.18181818181818182 $   \n",
        "$P(X^{(2)}=S|Y=-1) = (3+0.2)/(6+3*0.2)=0.4848484848484849 $  \n",
        "$P(X^{(2)}=M|Y=-1) = (2+0.2)/(6+3*0.2)=0.33333333333333337 $   \n",
        "$P(X^{(2)}=L|Y=-1) = (1+0.2)/(6+3*0.2)=0.18181818181818182 $   \n",
        "so  \n",
        "$P(Y=1)P(X^{(1)}=2|Y=1)P(X^{(2)}=S|Y=1) =0.5974025974025974* 0.33333333333333337*0.125=0.024891774891774892$  \n",
        "$P(Y=-1)P(X^{(1)}=2|Y=-1)P(X^{(2)}=S|Y=-1) =0.40259740259740264* 0.33333333333333337*0.4848484848484849=0.06506624688442873$  \n",
        "\n",
        "so, it should be -1."
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "86WQkGZefWi1",
        "colab_type": "code",
        "colab": {}
      },
      "source": [
        "class NB:\n",
        "    def __init__(self, lambda_):\n",
        "        self.lambda_ = lambda_\n",
        "        \n",
        "    def fit(self, X, y):\n",
        "        N, M = X.shape\n",
        "        data = np.hstack((X, y.reshape(N, 1)))\n",
        "        \n",
        "        py = {}\n",
        "        pxy = {}\n",
        "        uniquey, countsy = np.unique(y, return_counts=True)\n",
        "        tmp = dict(zip(uniquey, countsy))\n",
        "        for k,v in tmp.items():\n",
        "            py[k] = (v + self.lambda_)/(N + len(uniquey) * self.lambda_)\n",
        "            tmp_data = data[data[:, -1] == k]\n",
        "            for col in range(M):\n",
        "                uniquecol, countscol = np.unique(tmp_data[:,col], return_counts=True)\n",
        "                tmp1 = dict(zip(uniquecol, countscol))\n",
        "                for kk, vv in tmp1.items():\n",
        "                    pxy['X({})={}|Y={}'.format(col+1, kk, k)] = (vv + self.lambda_)/(v + len(uniquecol) * self.lambda_)\n",
        "                    \n",
        "        self.py = py\n",
        "        self.pxy = pxy\n",
        "\n",
        "        #return self.py, self.pxy\n",
        "    \n",
        "    def predict(self, x):\n",
        "        M = len(x)\n",
        "        res = {}\n",
        "        for k,v in self.py.items():\n",
        "            p = v\n",
        "            for i in range(len(x)):\n",
        "                p = p * self.pxy['X({})={}|Y={}'.format(i+1, x[i], k)]\n",
        "            res[k] = p\n",
        "        print(res)\n",
        "        maxp = -1\n",
        "        maxk = -1\n",
        "        for kk,vv in res.items():\n",
        "            if vv > maxp:\n",
        "                maxp = vv\n",
        "                maxk = kk\n",
        "                \n",
        "        return maxk"
      ],
      "execution_count": 0,
      "outputs": []
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "3hPRglhJfWi3",
        "colab_type": "code",
        "colab": {}
      },
      "source": [
        "lambda_ = 0.2\n",
        "d = {'S':0, 'M':1, 'L':2}\n",
        "\n",
        "X = np.array([[1, d['S']], [1, d['M']], [1, d['M']],\n",
        "             [1, d['S']], [1, d['S']], [2, d['S']],\n",
        "             [2, d['M']], [2, d['M']], [2, d['L']],\n",
        "             [2, d['L']], [3, d['L']], [3, d['M']],\n",
        "             [3, d['M']], [3, d['L']], [3, d['L']]])\n",
        "\n",
        "y = np.array([-1, -1, 1, 1, -1, -1, -1, 1, 1, 1, 1, 1, 1, 1, -1])"
      ],
      "execution_count": 0,
      "outputs": []
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "fs8vvcpWfWi5",
        "colab_type": "code",
        "outputId": "5b1fafb5-b4de-4618-fdad-baa42385751f",
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 287
        }
      },
      "source": [
        "X"
      ],
      "execution_count": 129,
      "outputs": [
        {
          "output_type": "execute_result",
          "data": {
            "text/plain": [
              "array([[1, 0],\n",
              "       [1, 1],\n",
              "       [1, 1],\n",
              "       [1, 0],\n",
              "       [1, 0],\n",
              "       [2, 0],\n",
              "       [2, 1],\n",
              "       [2, 1],\n",
              "       [2, 2],\n",
              "       [2, 2],\n",
              "       [3, 2],\n",
              "       [3, 1],\n",
              "       [3, 1],\n",
              "       [3, 2],\n",
              "       [3, 2]])"
            ]
          },
          "metadata": {
            "tags": []
          },
          "execution_count": 129
        }
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "FM89mRoZRiP1",
        "colab_type": "code",
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 35
        },
        "outputId": "93afb1dd-3be8-4c55-cbe0-f0549365e38a"
      },
      "source": [
        "y"
      ],
      "execution_count": 130,
      "outputs": [
        {
          "output_type": "execute_result",
          "data": {
            "text/plain": [
              "array([-1, -1,  1,  1, -1, -1, -1,  1,  1,  1,  1,  1,  1,  1, -1])"
            ]
          },
          "metadata": {
            "tags": []
          },
          "execution_count": 130
        }
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "l08zoOrQRlcN",
        "colab_type": "code",
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 53
        },
        "outputId": "71096a11-5d50-4122-84ab-e0ab415dfc81"
      },
      "source": [
        "model = NB(lambda_)\n",
        "model.fit(X,y)\n",
        "model.predict(np.array([2, 0]))"
      ],
      "execution_count": 77,
      "outputs": [
        {
          "output_type": "stream",
          "text": [
            "{-1: 0.06506624688442873, 1: 0.024891774891774892}\n"
          ],
          "name": "stdout"
        },
        {
          "output_type": "execute_result",
          "data": {
            "text/plain": [
              "-1"
            ]
          },
          "metadata": {
            "tags": []
          },
          "execution_count": 77
        }
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "wULij7sgcQna",
        "colab_type": "code",
        "colab": {}
      },
      "source": [
        "# data\n",
        "def create_data():\n",
        "    iris = load_iris()\n",
        "    df = pd.DataFrame(iris.data, columns=iris.feature_names)\n",
        "    df['label'] = iris.target\n",
        "    df.columns = ['sepal length', 'sepal width', 'petal length', 'petal width', 'label']\n",
        "    data = np.array(df.iloc[:100, :])\n",
        "    # print(data)\n",
        "    return data[:,:-1], data[:,-1]"
      ],
      "execution_count": 0,
      "outputs": []
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "wniDd3wMcTRW",
        "colab_type": "code",
        "colab": {}
      },
      "source": [
        "X, y = create_data()\n",
        "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)"
      ],
      "execution_count": 0,
      "outputs": []
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "G6NBwGCxcUur",
        "colab_type": "code",
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 35
        },
        "outputId": "e85d5a75-de23-4ebf-fb00-7d1ce166b0ee"
      },
      "source": [
        "X_test[0], y_test[0]"
      ],
      "execution_count": 80,
      "outputs": [
        {
          "output_type": "execute_result",
          "data": {
            "text/plain": [
              "(array([5.6, 3. , 4.5, 1.5]), 1.0)"
            ]
          },
          "metadata": {
            "tags": []
          },
          "execution_count": 80
        }
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "J_xJWo5GcVya",
        "colab_type": "code",
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 35
        },
        "outputId": "03afa1be-a569-4857-d05d-7b3f70cc9a02"
      },
      "source": [
        "X_train.shape"
      ],
      "execution_count": 82,
      "outputs": [
        {
          "output_type": "execute_result",
          "data": {
            "text/plain": [
              "(70, 4)"
            ]
          },
          "metadata": {
            "tags": []
          },
          "execution_count": 82
        }
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "GyXsq6VvfWi-",
        "colab_type": "text"
      },
      "source": [
        "## GaussianNB 高斯朴素贝叶斯\n",
        "\n",
        "特征的可能性被假设为高斯\n",
        "\n",
        "概率密度函数：\n",
        "$$P(x_i | y_k)=\\frac{1}{\\sqrt{2\\pi\\sigma^2_{yk}}}exp(-\\frac{(x_i-\\mu_{yk})^2}{2\\sigma^2_{yk}})$$\n",
        "\n",
        "数学期望(mean)：$\\mu$，方差：$\\sigma^2=\\frac{\\sum(X-\\mu)^2}{N}$"
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "uqJnMoUrfWi-",
        "colab_type": "code",
        "colab": {}
      },
      "source": [
        "class NaiveBayes:\n",
        "    def fit(self, X, y):\n",
        "        self.classes = list(np.unique(y))\n",
        "        self.parameters = {}\n",
        "        \n",
        "        for c in self.classes:\n",
        "            # 计算每个种类的平均值，方差，先验概率\n",
        "            X_Index_c = X[np.where(y == c)]\n",
        "            X_index_c_mean = np.mean(X_Index_c, axis=0, keepdims=True)\n",
        "            X_index_c_var = np.var(X_Index_c, axis=0, keepdims=True)\n",
        "            parameters = {\"mean\": X_index_c_mean, \"var\": X_index_c_var, \"prior\": X_Index_c.shape[0] / X.shape[0]}\n",
        "            self.parameters[\"class\" + str(c)] = parameters\n",
        "            print(self.parameters)\n",
        "            \n",
        "    def _pdf(self, X, classes):\n",
        "        # 一维高斯分布的概率密度函数\n",
        "        eps = 1e-4\n",
        "        mean = self.parameters[\"class\" + str(classes)][\"mean\"]\n",
        "        var = self.parameters[\"class\" + str(classes)][\"var\"]\n",
        "        \n",
        "        numerator = np.exp(-(X - mean) ** 2 / (2 * var + eps))\n",
        "        denominator = np.sqrt(2 * np.pi * var + eps)\n",
        "        \n",
        "        # 取对数防止数值溢出\n",
        "        result = np.sum(np.log(numerator / denominator), axis=1, keepdims=True)\n",
        "        \n",
        "        return result.T\n",
        "    \n",
        "    def _predict(self, X):\n",
        "        output = []\n",
        "        for y in self.classes:\n",
        "            prior = np.log(self.parameters[\"class\" + str(y)][\"prior\"])\n",
        "            posterior = self._pdf(X, y)\n",
        "            prediction = prior + posterior\n",
        "            output.append(prediction)\n",
        "        return output\n",
        "    \n",
        "    def predict(self, X):\n",
        "        # 取概率最大的类别返回预测值\n",
        "        output = self._predict(X)\n",
        "        output = np.reshape(output, (len(self.classes), X.shape[0]))\n",
        "        prediction = np.argmax(output, axis=0)\n",
        "        return prediction\n",
        "    \n",
        "    def score(self, X_test, y_test):\n",
        "        right = 0\n",
        "        pred = self.predict(X_test)\n",
        "        right = (y_test - pred == 0).sum()\n",
        "\n",
        "        return right / float(len(X_test))"
      ],
      "execution_count": 0,
      "outputs": []
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "NpeBcwKJfWjA",
        "colab_type": "code",
        "colab": {}
      },
      "source": [
        "model = NaiveBayes()"
      ],
      "execution_count": 0,
      "outputs": []
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "JLj3a70GfWjD",
        "colab_type": "code",
        "outputId": "ef5182a8-668e-4f62-93fb-314f95f68220",
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 73
        }
      },
      "source": [
        "model.fit(X_train, y_train)"
      ],
      "execution_count": 123,
      "outputs": [
        {
          "output_type": "stream",
          "text": [
            "{'class0.0': {'mean': array([[5.02571429, 3.42857143, 1.49142857, 0.24857143]]), 'var': array([[0.10648163, 0.15918367, 0.02478367, 0.01278367]]), 'prior': 0.5}}\n",
            "{'class0.0': {'mean': array([[5.02571429, 3.42857143, 1.49142857, 0.24857143]]), 'var': array([[0.10648163, 0.15918367, 0.02478367, 0.01278367]]), 'prior': 0.5}, 'class1.0': {'mean': array([[5.94285714, 2.77714286, 4.27428571, 1.34      ]]), 'var': array([[0.18816327, 0.09833469, 0.17505306, 0.03954286]]), 'prior': 0.5}}\n"
          ],
          "name": "stdout"
        }
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "x9PDXudxfWjF",
        "colab_type": "code",
        "outputId": "68a6c9c7-9524-47fe-800e-d8a2d080ddea",
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 35
        }
      },
      "source": [
        "print(model.predict(X_test))"
      ],
      "execution_count": 124,
      "outputs": [
        {
          "output_type": "stream",
          "text": [
            "[1 0 0 0 1 0 0 0 1 0 1 0 1 1 1 1 0 1 1 1 0 0 0 0 0 1 1 1 1 0]\n"
          ],
          "name": "stdout"
        }
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "xMO7vvVvfWjI",
        "colab_type": "code",
        "outputId": "86148657-b876-4277-b3e0-678254b0ddaf",
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 35
        }
      },
      "source": [
        "model.score(X_test, y_test)"
      ],
      "execution_count": 125,
      "outputs": [
        {
          "output_type": "execute_result",
          "data": {
            "text/plain": [
              "1.0"
            ]
          },
          "metadata": {
            "tags": []
          },
          "execution_count": 125
        }
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "collapsed": true,
        "id": "TGnmFQFkfWjL",
        "colab_type": "text"
      },
      "source": [
        "scikit-learn实例\n",
        "\n",
        "# sklearn.naive_bayes"
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "EBKRlWmsfWjM",
        "colab_type": "code",
        "colab": {}
      },
      "source": [
        "from sklearn.naive_bayes import GaussianNB"
      ],
      "execution_count": 0,
      "outputs": []
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "S7Q8mOzmfWjO",
        "colab_type": "code",
        "outputId": "d7fbefa1-b855-4ce5-9352-81049d510232",
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 35
        }
      },
      "source": [
        "clf = GaussianNB()\n",
        "clf.fit(X, y)"
      ],
      "execution_count": 133,
      "outputs": [
        {
          "output_type": "execute_result",
          "data": {
            "text/plain": [
              "GaussianNB(priors=None, var_smoothing=1e-09)"
            ]
          },
          "metadata": {
            "tags": []
          },
          "execution_count": 133
        }
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "BdAKVtxXfWjT",
        "colab_type": "code",
        "outputId": "e994d698-ee17-4c39-aa4f-0ed491f7fad5",
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 35
        }
      },
      "source": [
        "clf.predict([[2, 0]])"
      ],
      "execution_count": 134,
      "outputs": [
        {
          "output_type": "execute_result",
          "data": {
            "text/plain": [
              "array([-1])"
            ]
          },
          "metadata": {
            "tags": []
          },
          "execution_count": 134
        }
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "colab_type": "code",
        "id": "7qit4xK_0aka",
        "colab": {}
      },
      "source": [
        "from sklearn.naive_bayes import BernoulliNB, MultinomialNB # 伯努利模型和多项式模型"
      ],
      "execution_count": 0,
      "outputs": []
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "l4sFBX_u0drg",
        "colab_type": "code",
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 35
        },
        "outputId": "a7a74c51-e62e-46d1-b966-d00b7ac266f8"
      },
      "source": [
        "clf1 = BernoulliNB()\n",
        "clf1.fit(X, y)\n",
        "clf1.predict([[2, 0]])"
      ],
      "execution_count": 138,
      "outputs": [
        {
          "output_type": "execute_result",
          "data": {
            "text/plain": [
              "array([-1])"
            ]
          },
          "metadata": {
            "tags": []
          },
          "execution_count": 138
        }
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "QEClQeuV0qw2",
        "colab_type": "code",
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 35
        },
        "outputId": "bef9b428-fc41-4a06-b120-8c66ab380a56"
      },
      "source": [
        "clf2 = MultinomialNB()\n",
        "clf2.fit(X, y)\n",
        "clf2.predict([[2, 0]])"
      ],
      "execution_count": 139,
      "outputs": [
        {
          "output_type": "execute_result",
          "data": {
            "text/plain": [
              "array([1])"
            ]
          },
          "metadata": {
            "tags": []
          },
          "execution_count": 139
        }
      ]
    }
  ]
}